2. Correlaciones, tratamiento de missing y outliers:¶

Importo librerías:¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
#import warnings

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

Funciones:¶

In [2]:
import funciones_auxiliares as f_aux

Lectura de datos del preprocesado inicial:¶

In [3]:
df_fraud = pd.read_csv("../data/df_data_initial_preprocessing.csv").drop("Unnamed: 0", axis = 1)

df_fraud.shape
Out[3]:
(1000000, 32)
In [4]:
df_fraud
Out[4]:
intended_balcon_amount prev_address_months_count bank_months_count current_address_months_count session_length_in_minutes device_distinct_emails_8w fraud_bool foreign_request phone_mobile_valid has_other_cards proposed_credit_limit device_os source housing_status keep_alive_session device_fraud_count phone_home_valid credit_risk_score email_is_free income employment_status date_of_birth_distinct_emails_4w bank_branch_count_8w velocity_4w velocity_24h velocity_6h zip_count_4w payment_type days_since_request customer_age name_email_similarity month
0 NaN NaN 24.0 88.0 3.888115 1.0 1 0.0 0.0 0.0 500.0 windows INTERNET BA 0.0 0.0 1.0 185.0 0.0 0.9 CA 6.0 1.0 3863.647740 3134.319630 10650.765523 769.0 AA 0.020925 50.0 0.166828 7.0
1 NaN NaN 15.0 144.0 31.798819 1.0 1 0.0 0.0 0.0 1500.0 windows INTERNET BA 0.0 0.0 0.0 259.0 1.0 0.9 CA 3.0 718.0 3124.298166 2670.918292 534.047319 366.0 AB 0.005418 50.0 0.296286 7.0
2 NaN NaN NaN 132.0 4.728705 1.0 1 0.0 1.0 0.0 200.0 other INTERNET BA 0.0 0.0 0.0 177.0 1.0 0.9 CB 14.0 1.0 3159.590679 2893.621498 4048.534263 870.0 AC 3.108549 40.0 0.044985 7.0
3 NaN NaN 31.0 22.0 2.047904 1.0 1 0.0 1.0 1.0 200.0 linux INTERNET BA 0.0 0.0 0.0 110.0 1.0 0.9 CA 6.0 1921.0 3022.261812 4054.908412 3457.064063 810.0 AB 0.019079 50.0 0.159511 7.0
4 NaN NaN 31.0 218.0 3.775225 1.0 1 0.0 0.0 0.0 1500.0 macintosh INTERNET BA 1.0 0.0 1.0 295.0 1.0 0.9 CA 2.0 1990.0 3087.670952 2728.237159 5020.341679 890.0 AB 0.004441 50.0 0.596414 7.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
999995 NaN NaN 25.0 104.0 8.511502 1.0 0 0.0 1.0 0.0 200.0 linux INTERNET BC 1.0 0.0 1.0 75.0 1.0 0.6 CA 8.0 1.0 4972.635997 8341.468557 7905.711839 804.0 AB 0.030592 40.0 0.192631 4.0
999996 NaN 148.0 NaN 9.0 8.967865 1.0 0 0.0 1.0 0.0 200.0 windows INTERNET BC 0.0 0.0 1.0 154.0 1.0 0.8 CC 2.0 0.0 5022.728108 4955.170808 5391.470463 3306.0 AC 1.628119 50.0 0.322989 4.0
999997 34.692760 NaN 11.0 30.0 8.195531 1.0 0 0.0 1.0 0.0 200.0 other INTERNET BC 0.0 0.0 0.0 64.0 0.0 0.8 CF 6.0 2023.0 4377.196321 5670.654316 8063.102636 1522.0 AA 0.018563 20.0 0.879403 4.0
999998 94.661055 NaN 28.0 189.0 4.336064 1.0 0 0.0 0.0 0.0 500.0 windows INTERNET BA 1.0 0.0 1.0 163.0 0.0 0.9 CA 6.0 1678.0 4394.803296 3982.582204 8092.641762 1418.0 AA 0.015352 20.0 0.762112 4.0
999999 9.908499 NaN 15.0 321.0 6.717022 1.0 0 0.0 1.0 0.0 200.0 linux INTERNET BE 0.0 0.0 0.0 36.0 1.0 0.2 CA 12.0 2.0 4352.334543 3695.308261 6169.630036 951.0 AA 2.655916 20.0 0.697452 4.0

1000000 rows × 32 columns

In [5]:
df_fraud.head()
Out[5]:
intended_balcon_amount prev_address_months_count bank_months_count current_address_months_count session_length_in_minutes device_distinct_emails_8w fraud_bool foreign_request phone_mobile_valid has_other_cards proposed_credit_limit device_os source housing_status keep_alive_session device_fraud_count phone_home_valid credit_risk_score email_is_free income employment_status date_of_birth_distinct_emails_4w bank_branch_count_8w velocity_4w velocity_24h velocity_6h zip_count_4w payment_type days_since_request customer_age name_email_similarity month
0 NaN NaN 24.0 88.0 3.888115 1.0 1 0.0 0.0 0.0 500.0 windows INTERNET BA 0.0 0.0 1.0 185.0 0.0 0.9 CA 6.0 1.0 3863.647740 3134.319630 10650.765523 769.0 AA 0.020925 50.0 0.166828 7.0
1 NaN NaN 15.0 144.0 31.798819 1.0 1 0.0 0.0 0.0 1500.0 windows INTERNET BA 0.0 0.0 0.0 259.0 1.0 0.9 CA 3.0 718.0 3124.298166 2670.918292 534.047319 366.0 AB 0.005418 50.0 0.296286 7.0
2 NaN NaN NaN 132.0 4.728705 1.0 1 0.0 1.0 0.0 200.0 other INTERNET BA 0.0 0.0 0.0 177.0 1.0 0.9 CB 14.0 1.0 3159.590679 2893.621498 4048.534263 870.0 AC 3.108549 40.0 0.044985 7.0
3 NaN NaN 31.0 22.0 2.047904 1.0 1 0.0 1.0 1.0 200.0 linux INTERNET BA 0.0 0.0 0.0 110.0 1.0 0.9 CA 6.0 1921.0 3022.261812 4054.908412 3457.064063 810.0 AB 0.019079 50.0 0.159511 7.0
4 NaN NaN 31.0 218.0 3.775225 1.0 1 0.0 0.0 0.0 1500.0 macintosh INTERNET BA 1.0 0.0 1.0 295.0 1.0 0.9 CA 2.0 1990.0 3087.670952 2728.237159 5020.341679 890.0 AB 0.004441 50.0 0.596414 7.0
In [6]:
df_fraud.columns
Out[6]:
Index(['intended_balcon_amount', 'prev_address_months_count',
       'bank_months_count', 'current_address_months_count',
       'session_length_in_minutes', 'device_distinct_emails_8w', 'fraud_bool',
       'foreign_request', 'phone_mobile_valid', 'has_other_cards',
       'proposed_credit_limit', 'device_os', 'source', 'housing_status',
       'keep_alive_session', 'device_fraud_count', 'phone_home_valid',
       'credit_risk_score', 'email_is_free', 'income', 'employment_status',
       'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w',
       'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w',
       'payment_type', 'days_since_request', 'customer_age',
       'name_email_similarity', 'month'],
      dtype='object')
In [7]:
categoricals_list, other = f_aux.dame_variables_categoricas(dataset = df_fraud)
df_fraud[categoricals_list] = df_fraud[categoricals_list].astype("category")
continuous_list = list(df_fraud.select_dtypes("float").columns)
df_fraud[continuous_list] = df_fraud[continuous_list].astype(float)
df_fraud.dtypes #aquí estamos comprobando en todas las columnas de qué tipo son
Out[7]:
intended_balcon_amount               float64
prev_address_months_count            float64
bank_months_count                    float64
current_address_months_count         float64
session_length_in_minutes            float64
device_distinct_emails_8w            float64
fraud_bool                          category
foreign_request                      float64
phone_mobile_valid                   float64
has_other_cards                      float64
proposed_credit_limit                float64
device_os                           category
source                              category
housing_status                      category
keep_alive_session                   float64
device_fraud_count                   float64
phone_home_valid                     float64
credit_risk_score                    float64
email_is_free                        float64
income                               float64
employment_status                   category
date_of_birth_distinct_emails_4w     float64
bank_branch_count_8w                 float64
velocity_4w                          float64
velocity_24h                         float64
velocity_6h                          float64
zip_count_4w                         float64
payment_type                        category
days_since_request                   float64
customer_age                         float64
name_email_similarity                float64
month                                float64
dtype: object

Separación en train y test estratificado:¶

In [8]:
# Calculate percentages and reset index for the first DataFrame
df_fraud_bool = df_fraud["fraud_bool"]\
    .value_counts(normalize = True)\
    .mul(100).rename("percent").reset_index()

# Calculate counts and reset index for the second DataFrame
df_fraud_bool_conteo = df_fraud["fraud_bool"].value_counts().reset_index()

# Merge the two DataFrames on the "fraud_bool" column
df_fraud_bool_pc = pd.merge(df_fraud_bool, df_fraud_bool_conteo, on = "fraud_bool", how = 'inner')\
        .rename(columns = {"fraud_bool": "index"})
df_fraud_bool_pc


fig = px.histogram(df_fraud_bool_pc, x = "index", y = ['percent'])
fig.update_xaxes(tickvals = [0, 1])
fig.show()
In [9]:
from sklearn.model_selection import train_test_split
x_df_fraud, x_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(
                                                                    df_fraud.drop("fraud_bool",axis=1), 
                                                                    df_fraud["fraud_bool"], 
                                                                    stratify = df_fraud["fraud_bool"], 
                                                                    test_size = 0.2)
df_fraud_train = pd.concat([x_df_fraud, y_df_fraud], axis = 1)
df_fraud_test = pd.concat([x_df_fraud_test, y_df_fraud_test], axis = 1)
df_fraud_test
Out[9]:
intended_balcon_amount prev_address_months_count bank_months_count current_address_months_count session_length_in_minutes device_distinct_emails_8w foreign_request phone_mobile_valid has_other_cards proposed_credit_limit device_os source housing_status keep_alive_session device_fraud_count phone_home_valid credit_risk_score email_is_free income employment_status date_of_birth_distinct_emails_4w bank_branch_count_8w velocity_4w velocity_24h velocity_6h zip_count_4w payment_type days_since_request customer_age name_email_similarity month fraud_bool
164893 NaN NaN NaN 34.0 6.284015 1.0 0.0 1.0 0.0 500.0 other INTERNET BA 0.0 0.0 0.0 158.0 0.0 0.1 CA 8.0 0.0 4627.661949 5892.208076 6300.352459 1066.0 AC 14.278374 30.0 0.872882 3.0 0
259912 NaN NaN 19.0 197.0 3.463159 1.0 0.0 1.0 0.0 200.0 macintosh INTERNET BB 1.0 0.0 0.0 53.0 0.0 0.1 CA 13.0 12.0 5136.370224 5662.654955 8540.003798 829.0 AD 0.038717 20.0 0.425354 2.0 0
158133 26.750203 NaN 31.0 184.0 3.512410 1.0 0.0 1.0 1.0 1500.0 other INTERNET BA 0.0 0.0 0.0 185.0 0.0 0.8 CA 12.0 354.0 5032.692675 4076.097781 4712.130828 1966.0 AA 0.009290 30.0 0.127620 3.0 0
974829 NaN 41.0 1.0 NaN 2.965838 1.0 0.0 1.0 0.0 200.0 other INTERNET BC 1.0 0.0 0.0 144.0 0.0 0.8 CA 8.0 8.0 3107.382105 2828.137810 3993.961844 1422.0 AD 0.015090 30.0 0.098489 4.0 0
900372 NaN NaN 1.0 11.0 4.969989 1.0 0.0 0.0 1.0 200.0 linux INTERNET BC 0.0 0.0 0.0 130.0 1.0 0.5 CA 14.0 6.0 4759.997038 5400.575658 10078.408844 2077.0 AB 0.006393 30.0 0.851772 4.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
798488 NaN NaN 25.0 30.0 2.140956 1.0 0.0 1.0 0.0 200.0 linux INTERNET BB 0.0 0.0 0.0 70.0 1.0 0.6 CA 19.0 14.0 4411.063001 7368.651690 4045.938748 1540.0 AB 0.001100 30.0 0.655980 5.0 0
216923 NaN NaN 15.0 119.0 1.372113 1.0 0.0 1.0 0.0 200.0 other INTERNET BA 0.0 0.0 1.0 172.0 0.0 0.8 CA 4.0 13.0 4850.387098 5556.945586 3468.782135 1490.0 AB 0.010512 40.0 0.087478 3.0 0
38651 NaN 28.0 28.0 6.0 7.358852 1.0 0.0 0.0 0.0 990.0 windows INTERNET BB 1.0 0.0 1.0 117.0 1.0 0.1 CA 10.0 15.0 3046.269860 2193.108049 1639.216399 630.0 AD 0.000322 30.0 0.708867 7.0 0
855203 52.649272 NaN 1.0 178.0 6.331757 1.0 0.0 1.0 1.0 1500.0 other INTERNET BE 0.0 0.0 0.0 228.0 1.0 0.9 CA 2.0 19.0 4169.087757 4579.886623 4325.898342 735.0 AA 0.016136 30.0 0.788005 5.0 0
677134 NaN 11.0 26.0 8.0 3.204506 2.0 0.0 1.0 0.0 200.0 linux INTERNET BC 0.0 0.0 1.0 89.0 1.0 0.3 CA 14.0 4.0 5986.894001 5613.811715 4928.563151 1441.0 AB 0.010249 30.0 0.546968 0.0 0

200000 rows × 32 columns

In [10]:
print("", '== Train\n', df_fraud_train["fraud_bool"].value_counts(normalize = True))
print("""
""", '== Test\n', df_fraud_test["fraud_bool"].value_counts(normalize = True))
 == Train
 fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64

 == Test
 fraud_bool
0    0.98897
1    0.01103
Name: proportion, dtype: float64

Visualización descriptiva de los datos:¶

In [11]:
fraud_series_null_columns = df_fraud_train.isnull().sum().sort_values(ascending = False)
fraud_series_null_rows = df_fraud_train.isnull().sum(axis = 1).sort_values(ascending = False)

df_null_columns = pd.DataFrame(fraud_series_null_columns, columns = ["null_columns"])
df_null_rows = pd.DataFrame(fraud_series_null_rows, columns = ["null_rows"])

df_null_rows["target"] = df_fraud["fraud_bool"]
df_null_columns["column_percentage"] = df_null_columns["null_columns"] / df_fraud.shape[0]
df_null_rows["row_percentage"] = df_null_rows["null_rows"] / df_fraud.shape[1]

df_null_columns.head(6)
Out[11]:
null_columns column_percentage
intended_balcon_amount 593888 0.593888
prev_address_months_count 570093 0.570093
bank_months_count 202769 0.202769
current_address_months_count 3427 0.003427
session_length_in_minutes 1582 0.001582
device_distinct_emails_8w 289 0.000289

intended_balcon_amount: Cerca del 59.45% de los valores en esta columna son nulos.

prev_address_months_count: Alrededor del 57.04% de los valores en esta columna son nulos.

bank_months_count: Aproximadamente el 20.33% de los valores en esta columna son nulos.

current_address_months_count: Un 0.34% de los valores en esta columna son nulos.

session_length_in_minutes: Un 0.16% de los valores en esta columna son nulos.

device_distinct_emails_8w: Solo alrededor del 0.03% de los valores en esta columna son nulos.

In [12]:
len(df_fraud_train.columns)
Out[12]:
32
In [13]:
len(df_fraud_train.columns[:16])
Out[13]:
16
In [14]:
len(df_fraud_train.columns[16:])
Out[14]:
16
In [15]:
for i in list(df_fraud_train.columns[:16]):
    if (df_fraud_train[i].dtype == float) & (i != "fraud_bool"):
        f_aux.plot_feature(df_fraud_train, col_name = i, isContinuous = True, target = "fraud_bool")
    elif  i != "fraud_bool":
        f_aux.plot_feature(df_fraud_train, col_name = i, isContinuous = False, target = "fraud_bool")
In [16]:
for i in list(df_fraud_train.columns[16:]):
    if (df_fraud_train[i].dtype == float) & (i != "fraud_bool"):
        f_aux.plot_feature(df_fraud_train, col_name = i, isContinuous = True, target = "fraud_bool")
    elif  i != "fraud_bool":
        f_aux.plot_feature(df_fraud_train, col_name = i, isContinuous = False, target = "fraud_bool")

Tratamiento de las variables continuas:¶

In [17]:
continuous_list
Out[17]:
['intended_balcon_amount',
 'prev_address_months_count',
 'bank_months_count',
 'current_address_months_count',
 'session_length_in_minutes',
 'device_distinct_emails_8w',
 'foreign_request',
 'phone_mobile_valid',
 'has_other_cards',
 'proposed_credit_limit',
 'keep_alive_session',
 'device_fraud_count',
 'phone_home_valid',
 'credit_risk_score',
 'email_is_free',
 'income',
 'date_of_birth_distinct_emails_4w',
 'bank_branch_count_8w',
 'velocity_4w',
 'velocity_24h',
 'velocity_6h',
 'zip_count_4w',
 'days_since_request',
 'customer_age',
 'name_email_similarity',
 'month']

Tratamiento de outliers:¶

In [18]:
f_aux.get_deviation_of_mean_perc(df_fraud_train, continuous_list, target = "fraud_bool", multiplier = 3)
Out[18]:
no_fraud fraud variable sum_outlier_values porcentaje_sum_outlier_values
0 0.989024 0.010976 intended_balcon_amount 1640 0.002050
1 0.993193 0.006807 prev_address_months_count 6905 0.008631
2 0.984446 0.015554 current_address_months_count 16845 0.021056
3 0.980334 0.019666 session_length_in_minutes 18814 0.023518
4 0.961992 0.038008 device_distinct_emails_8w 25179 0.031474
5 0.977977 0.022023 foreign_request 20206 0.025257
6 0.867963 0.132037 proposed_credit_limit 4938 0.006173
7 0.967928 0.032072 credit_risk_score 2775 0.003469
8 0.994164 0.005836 date_of_birth_distinct_emails_4w 4969 0.006211
9 0.989432 0.010568 bank_branch_count_8w 32741 0.040926
10 0.995327 0.004673 velocity_24h 428 0.000535
11 0.992505 0.007495 velocity_6h 3469 0.004336
12 0.990815 0.009185 zip_count_4w 12956 0.016195
13 0.987651 0.012349 days_since_request 14171 0.017714
14 0.957541 0.042459 customer_age 6312 0.007890

En la tabla de arriba podemos ver todos los outliers de todas las columnas contínuas del conjunto de datos. El dataset en su totalidad tiene un porcentaje de fraude de solo 2% y 98% de no tener fraude. Teniendo esto en cuenta, tiene sentido que la mayoria de los outliers tengan un porcentaje de fraude y no fraude parecido al de 2% y 98% respectivamente, pero podemos ver que esto no es lo que se observa en la variable de "proposed_credit_limit" donde cuentas bancarias que han cometido fraude son el 13% de los outliers, por lo que habría que revisar los outliers de esta variable más detenidamente ya que se excede de lo normal.

Además, las siguientes variables son las que tienen un porcentaje de outliers más altos en el conjunto de datos:

bank_branch_count_8w: Cerca del 4.1% de los valores son considerados atípicos.

current_address_months_count: Alrededor del 2.14% de los valores son considerados atípicos.

session_length_in_minutes: Aproximadamente el 2.35% de los valores son considerados atípicos.

device_distinct_emails_8w: Un 3.16% de los valores son considerados atípicos.

proposed_credit_limit: Cerca del 0.61% de los valores son considerados atípicos.

credit_risk_score: Alrededor del 0.35% de los valores son considerados atípicos. Lo mismo con los demás.

Se puede observar que, en las variables de "device_distinct_emails_8w", "credit_risk_score", y especialmente "customer_age", los outlier tienen un porcentaje mucho más alto de haber cometido fraude (36.9%, 33.6%, y 40.1%) comparado con los outliers del resto de las variables contínuas.

Correlaciones:¶

A continuación, correlaciones entre las diferentes variables:

In [19]:
# Correlaciones entre las diferentes variables
f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8,6])
Out[19]:
0
In [20]:
continuous_list.remove('device_fraud_count')

f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8,6])
Out[20]:
0

level_0: Variable en el primer nivel de la correlación.

level_1: Variable en el segundo nivel de la correlación.

Correlation: Valor de correlación entre las dos variables indicadas en level_0 y level_1.

Las correlaciones indican cómo dos variables están relacionadas entre sí. Aquí hay algunas observaciones sobre las correlaciones presentadas:

Comprobando que variables tienen una correlacón entre sí más alta del 0.4%:

Las variables de month y velocity_4w tienen una fuerte correlación positive de 0.85. Además, existe una correlación positiva moderada (0.61) entre credit_risk_score y proposed_credit_limit.

In [21]:
corr = df_fraud_train[continuous_list].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k = -1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation'] > 0.4]
Out[21]:
level_0 level_1 correlation
617 month velocity_4w 0.847791
309 credit_risk_score proposed_credit_limit 0.607147
618 month velocity_24h 0.549711
467 velocity_24h velocity_4w 0.538393
493 velocity_6h velocity_24h 0.463799
565 customer_age date_of_birth_distinct_emails_4w 0.419411
619 month velocity_6h 0.409437
492 velocity_6h velocity_4w 0.400548

Tratamiento de valores nulos:¶

¿Son todos los nulos de una clase de la variable objetivo? o tienen el mismo porcentaje de la variable objetivo?

In [22]:
df_fraud_train["intended_balcon_amount"].sort_index(ascending = True)
Out[22]:
0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
999994          NaN
999995          NaN
999996          NaN
999997    34.692760
999999     9.908499
Name: intended_balcon_amount, Length: 800000, dtype: float64
In [23]:
continuous_list
Out[23]:
['intended_balcon_amount',
 'prev_address_months_count',
 'bank_months_count',
 'current_address_months_count',
 'session_length_in_minutes',
 'device_distinct_emails_8w',
 'foreign_request',
 'phone_mobile_valid',
 'has_other_cards',
 'proposed_credit_limit',
 'keep_alive_session',
 'phone_home_valid',
 'credit_risk_score',
 'email_is_free',
 'income',
 'date_of_birth_distinct_emails_4w',
 'bank_branch_count_8w',
 'velocity_4w',
 'velocity_24h',
 'velocity_6h',
 'zip_count_4w',
 'days_since_request',
 'customer_age',
 'name_email_similarity',
 'month']
In [24]:
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
Out[24]:
no_fraud fraud variable sum_null_values porcentaje_sum_null_values
0 0.986885 0.013115 intended_balcon_amount 593888 0.742360
1 0.985771 0.014229 prev_address_months_count 570093 0.712616
2 0.983740 0.016260 bank_months_count 202769 0.253461
3 0.996207 0.003793 current_address_months_count 3427 0.004284
4 0.993047 0.006953 session_length_in_minutes 1582 0.001978
5 0.989619 0.010381 device_distinct_emails_8w 289 0.000361

Imputando los valores nulos de las variables "device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count", y "intended_balcon_amount" por sus medias dado que el porcentaje de nulos a lo largo del conjunto de datos es bajo en las cuatro variables:

In [25]:
df_fraud[["device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count", 
        "bank_months_count"]] = df_fraud[["device_distinct_emails_8w", "session_length_in_minutes", 
        "current_address_months_count", "bank_months_count"]].apply(lambda x: x.fillna(x.mean()))
In [26]:
f_aux.get_percent_null_values_target(df_fraud, continuous_list, target = "fraud_bool")
Out[26]:
no_fraud fraud variable sum_null_values porcentaje_sum_null_values
0 0.986873 0.013127 intended_balcon_amount 742523 0.742523
1 0.985785 0.014215 prev_address_months_count 712920 0.712920
2 NaN NaN bank_months_count 0 0.000000
3 NaN NaN current_address_months_count 0 0.000000
4 NaN NaN session_length_in_minutes 0 0.000000
5 NaN NaN device_distinct_emails_8w 0 0.000000

Dado que las otras dos variables con nulos tienen un porcentaje de elementos igual a nulo muy alto, uso el modelo de regresión de KNNImputer para imputar los valores missing de estas variables ("bank_months_count" y "prev_address_months_count") usando como regresoras todas las variables continuas:

In [27]:
df_fraud_train[["device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count", 
        "bank_months_count"]] = df_fraud_train[["device_distinct_emails_8w", "session_length_in_minutes", 
        "current_address_months_count", "bank_months_count"]].apply(lambda x: x.fillna(x.mean()))
In [28]:
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
Out[28]:
no_fraud fraud variable sum_null_values porcentaje_sum_null_values
0 0.986885 0.013115 intended_balcon_amount 593888 0.742360
1 0.985771 0.014229 prev_address_months_count 570093 0.712616
2 NaN NaN bank_months_count 0 0.000000
3 NaN NaN current_address_months_count 0 0.000000
4 NaN NaN session_length_in_minutes 0 0.000000
5 NaN NaN device_distinct_emails_8w 0 0.000000

Idealmente, en el tratamiento de datos nulos, usariamos el código de debajo para imputar los valores nulos de las dos columnas que tienen un porcentaje de nulos de más de 70% con el modelo de regresión de KNN Imputer, usando como regresoras todas la variables contínuas. Esto sería porque los valores que imputaría la regression KNN en vez de los nulos ayudarían a crear modelos mucho más precisos que si simplemente imputaramos la media de cada columna donde se encuentren los nulos. Sin embargo, como la celda de código de debajo tiene un tiempo de ejecución extremadamente largo, esto segundo es justo lo que vamos a hacer para poder crear predicciones (aunque vayan a ser menos precisas de lo que sería teóricamente posible),

In [29]:
#X_train = df_fraud_train[continuous_list]
#X_test = df_fraud_test_2[continuous_list]
#imputer = KNNImputer(n_neighbors = 2, weights = "uniform")
#model = imputer.fit(X_train)
#df_input_train = pd.DataFrame(model.transform(X_train), 
#                      columns = [i+'_input' for i in list(set(continuous_list))],index=df_fraud_train.index)
#df_input_test = pd.DataFrame(model.transform(X_test), 
#                      columns = [i+'_input' for i in list(set(continuous_list))],index=df_fraud_test_2.index)
In [30]:
df_fraud[continuous_list] = df_fraud[continuous_list].apply(lambda x: x.fillna(x.mean()))

x_df_fraud, x_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(
                                                                    df_fraud.drop("fraud_bool",axis=1), 
                                                                    df_fraud["fraud_bool"], 
                                                                    stratify = df_fraud["fraud_bool"], 
                                                                    test_size = 0.2)
df_fraud_train = pd.concat([x_df_fraud, y_df_fraud], axis = 1)
df_fraud_test = pd.concat([x_df_fraud_test, y_df_fraud_test], axis = 1)

f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8, 6])
Out[30]:
0
In [31]:
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
No existen variables con valores nulos

Tratamiento de las variables categoricas:¶

Para la correlacion de spearman es necesario convertir las variables categoricas en numericas y luego obtener la correlación

In [32]:
categoricals_list
Out[32]:
['fraud_bool',
 'device_os',
 'source',
 'housing_status',
 'employment_status',
 'payment_type']
In [33]:
def cramers_v(confusion_matrix):
    """ 
    calculate Cramers V statistic for categorical-categorical association.
    uses correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    
    confusion_matrix tabla creada con pd.crosstab()
    
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
In [34]:
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["device_os"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
device_os    linux  macintosh   other  windows   x11
fraud_bool                                          
0           264804      42377  272500   205771  5725
1             1357        600    1572     5227    67
Out[34]:
0.0809818636850644
In [35]:
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["fraud_bool"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
fraud_bool       0     1
fraud_bool              
0           791177     0
1                0  8823
Out[35]:
0.9999426978916621
In [36]:
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["source"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
source      INTERNET  TELEAPP
fraud_bool                   
0             785574     5603
1               8727       96
Out[36]:
0.0045096844811087725
In [37]:
categoricals_list
Out[37]:
['fraud_bool',
 'device_os',
 'source',
 'housing_status',
 'employment_status',
 'payment_type']
In [38]:
df_fraud["payment_type"].unique()
Out[38]:
['AA', 'AB', 'AC', 'AD', 'AE']
Categories (5, object): ['AA', 'AB', 'AC', 'AD', 'AE']
In [39]:
device_map = {'windows': 1, 'other': 2, 'linux': 3, 'macintosh': 4, "x11": 5}
source_map = {"INTERNET": 1, "TELEAPP": 2}
housing_map = {'BA': 1, 'BB': 2, 'BC': 3, 'BD': 4, "BE": 5, 'BF': 6, 'BG': 7}
employment_map = {'CA': 1, 'CB': 2, 'CC': 3, 'CD': 4, "CE": 5, 'CF': 6, 'CG': 7}
payment_map = {'AA': 1, 'AB': 2, 'AC': 3, 'AD': 4, "AE": 5, 'AF': 6, 'AG': 7}

df_fraud['device_os'] = df_fraud['device_os'].map(device_map).astype("float64")
df_fraud['source'] = df_fraud['source'].map(source_map).astype("float64")
df_fraud['housing_status'] = df_fraud['housing_status'].map(housing_map).astype("float64")
df_fraud['employment_status'] = df_fraud['employment_status'].map(employment_map).astype("float64")
df_fraud['payment_type'] = df_fraud['payment_type'].map(payment_map).astype("float64")
In [40]:
f_aux.get_corr_matrix(dataset = df_fraud[categoricals_list], metodo='spearman', size_figure=[5,4])
Out[40]:
0

Tratamiento de valores nulos:¶

No hay valores nulos en las variables categoricas de este conjunto de datos, pero si hubiera hariamos lo siguiente:

In [41]:
df_fraud_train[categoricals_list] = df_fraud_train[categoricals_list].astype("object")\
.fillna("SIN VALOR").astype("category")

df_fraud_test[categoricals_list] = df_fraud_test[categoricals_list].astype("object").\
fillna("SIN VALOR").astype("category")

Guardando las tablas de train y test:¶

In [42]:
df_fraud_train.to_csv("../data/train_df_data_preprocessing_missing_outlier.csv")
df_fraud_test.to_csv("../data/test_df_data_preprocessing_missing_outlier.csv")